************************************
*********** PROJECT INFO ***********
************************************

// Record Linkage for Character-Based Names
// Create dictionary crosswalk
// Author: Hannah Postel
// Date: 10/26/2022

**************************************
************** STEP ONE **************
******* IMPORT UNIHAN DATABASE *******
**************************************

// The Unihan database contains data on all CJK ideographs
// We're going to use pronunciations, definitions, other encodings, char. frequency

// 'dictionary-like' data
import delimited "$data/Unihan_DictionaryLikeData.txt", encoding(UTF-8) clear

// select variables, reshape, format
format v3 %18s
keep if v2=="kFourCornerCode" | v2=="kFrequency"
reshape wide v3, i(v1) j(v2) string
ren v1 utf
ren v3kFourCornerCode k4corner
ren v3kFrequency kFrequency

tempfile temp
save `temp', replace

// 'readings' data
// definition, cantonese and mandarin pronunciations
import delimited "$data/Unihan_Readings.txt", encoding(UTF-8) clear

// select variables, reshape, format
keep if v2=="kCantonese" | v2=="kDefinition" | v2=="kMandarin" 
reshape wide v3, i(v1) j(v2) string
ren v1 utf
drop if v3kDefinition=="" & v3kMandarin=="" // 3500 just cantonese
ren v3kDefinition kDef
ren v3kMandarin kPY
ren v3kCantonese kCantonese

// create full dataset
// n=42,472
merge 1:1 utf using `temp'
drop _merge
save "$data/unihan-all.dta", replace

// for now, keep only those with four-corner mappings
// this is how i'm identifying characters from the ny database
// n=16,256 (38%)
keep if k4corner!=""
format kDef %64s

// situ not in unihan
// add observation to the end and input info
insobs 1, after(_N)
replace k4corner="1762.0+2428.1" if _n==16257
replace utf="U+53F8 U+5F92" if _n==16257
replace kCantonese="si1 tou4" if _n==16257
replace kPY="sī tú" if _n==16257
replace kDef="minister of education (history) / two-character surname Situ" if _n==16257
// assign non-empty frequency so it doesn't get dropped later
replace kFrequency="x" if _n==16257

// generate  first initial for merging with NYC
gen FI_c = substr(kCantonese,1,1)
replace FI_c=upper(FI_c)
gen FI_m = substr(kPY,1,1)
replace FI_m=upper(FI_m)
gen FI=FI_c if FI_c==FI_m

// keep those with either kfrequency OR those with surname in definition
// all of them should have frequencies listed b/c fairly common
// n=5,174
keep if strpos(kDef, "surname")>0 | kFrequency!=""

// drop those without decimal places since i just added mine
// n=4,168
drop if strpos(k4corner, ".")==0

save "$data/unihan-merge.dta", replace

*********************************
****** NYC EXCLUSION FILES ******
*********************************

// index of ny exclusion files, including romanization and four-character code
// we use these to make a crosswalk between romanized spelling and character equivalent
// includes prevalence of each character per romanization

import excel "$data/NYC index.xlsx", sheet("all") firstrow clear

drop if CHINESE1=="" // drops 7,594, n=10,959
contract CHINESE1 NAME1 // 2088 mappings

// remove "mrs" monikers & remove resulting dupes
replace NAME1=subinstr(NAME1,"MRS","",.)
replace NAME1 = strtrim(NAME1)
duplicates drop NAME1 CHINESE1, force

// make all caps & remove resulting dupes
replace NAME1=upper(NAME1)
duplicates drop NAME1 CHINESE1, force

// generate "first initial" for distinguishing among multiple mapped characters
// sometimes also have to specify different utf codes if still the same 5-digit number
gen FI = substr(NAME1,1,1)
gen utf ="."

// generate string length variable for help assigning characters
gen length = length(NAME1)
drop if length==1
drop if length==2 & strpos(NAME1, ".")>0

// remove non-chinese names.
drop if NAME1=="GEORGE" | NAME1=="ALFRED" |  NAME1=="AH" |  NAME1=="EDWARD" | ///
NAME1=="HAROLD" | NAME1=="HARRY" | NAME1=="MORIS" | NAME1=="ROBERT" | NAME1=="RUTH" | ///
NAME1=="VIRGINIA" | NAME1=="WILLIE" | NAME1=="FRANKLIN" | NAME1=="JAMES" | NAME1=="ALEXANDER" | ///
NAME1=="JOSEPH" | NAME1=="MADELINE" | NAME1=="BENJAMIN" | NAME1=="EUGENE" |  NAME1=="HENRY" | ///
NAME1=="LOUIS" | NAME1=="LILLIAN" | NAME1=="WILLIAM" | NAME1=="ANNA" | NAME1=="ANDREW" |  ///
NAME1=="VICTOR" | NAME1=="CHARLES" | NAME1=="CHARLIE" | NAME1=="CYRIL" | NAME1=="AILEEN" | ///
NAME1=="AMY" |  NAME1=="LAWRENCE" | NAME1=="RICHARD" | NAME1=="ALICE" |  NAME1=="SAMUEL" | ///
NAME1=="FRANCES" | NAME1=="MARGARET" | NAME1=="EDGAR" | NAME1=="KENNETH" | NAME1=="KATHERINE" | ///
NAME1=="MARY" | NAME1=="HELEN" | NAME1=="ISAAC" | NAME1=="CHARLEY" | NAME1=="DANIEL" | ///
NAME1=="DONALD" | NAME1=="GRACE" | NAME1=="GLADYS" | NAME1=="IRVING" | NAME1=="LIVINGSTON" | ///
NAME1=="NELLIE" | NAME1=="JOHN" | NAME1=="HERBERT" | NAME1=="FLORENCE" | NAME1=="BETTY" | ///
NAME1=="ARTHUR" | NAME1=="DICK" | NAME1=="EVERETT" | NAME1=="JAKE" | NAME1=="MILTON" | ///
NAME1=="TOM" | NAME1=="LIONEL" | NAME1=="HAMILTON" | NAME1=="LAURA" | NAME1=="BESSIE" | ///
NAME1=="EDNA" | NAME1=="LILLIE" | NAME1=="" | NAME1=="ROSE" | NAME1=="THOMAS" | NAME1=="FRANK" | ///
NAME1=="LANDON" | NAME1=="GEORGE" | NAME1=="GEOGE" | NAME1=="VIVIAN" | NAME1=="STEPHEN" | ///
NAME1=="MABEL" | NAME1=="CLARA" | NAME1=="HOWARD" | NAME1=="HARRIET" | NAME1=="JEMES" | ///
NAME1=="ELAINE" | NAME1=="DOROTHY" | NAME1=="LILLY" | NAME1=="MAMIE" | NAME1=="NELL" | ///
NAME1=="STELLA" | NAME1=="PHILIP" | NAME1=="MAURICE" | NAME1=="MABLE" | NAME1=="LYNNE" | ///
NAME1=="PAULINE" | NAME1=="MARIE" | NAME1=="LINCOLN" | NAME1=="WARREN" | NAME1=="CLINTON" | ///
NAME1=="IRENE" | NAME1=="JUSTIN" | NAME1=="HERMAN" | NAME1=="MARIE" | NAME1=="MILDRED" | ///
NAME1=="MURIEL" | NAME1=="RAMON" | NAME1=="ELEANORA" | NAME1=="ARMSTRONG" | NAME1=="WALDO" | ///
NAME1=="WALTER" | NAME1=="DOROTHEA" | NAME1=="VIOLETTE" | NAME1=="CAROLINE" | NAME1=="PHILIPS" | ///
NAME1=="JESSIE" | NAME1=="JENNIE" | NAME1=="SOLOMON" | NAME1=="BLANCHE" | NAME1=="THEODORE" | ///
NAME1=="CLARENCE" | NAME1=="ENRIQUE" | NAME1=="JOSEPHINE" | NAME1=="HANCOCK" | NAME1=="ROSEMAY" | ///
NAME1=="CLARION" | NAME1=="ANTONIO" | NAME1=="PHILLIP" | NAME1=="CHESTER" | NAME1=="FRANCIS" | ///
NAME1=="DICKSON" | NAME1=="GERTUDE" | NAME1=="LENORE" | NAME1=="NETTIE" | NAME1=="NELSON" | ///
NAME1=="DORCAS" | NAME1=="JULIAN" | NAME1=="WESLEY" | NAME1=="OLIVER" | NAME1=="LOUISE" | ///
NAME1=="LEONOR" | NAME1=="SPRING" | NAME1=="ALBERT" | NAME1=="DANIAL" | NAME1=="ARCHIE" | ///
NAME1=="GRIGER" | NAME1=="HEWETT" | NAME1=="AUTHUR" | NAME1=="MORGAN" | NAME1=="SHELLY" | ///
NAME1=="GLORIA" | NAME1=="NELSON" | NAME1=="ISABEL" | NAME1=="JULIA" | NAME1=="DAISY" | ///
NAME1=="EDWIN" | NAME1=="NORMAN" | NAME1=="ALLAN" | NAME1=="PETER" | NAME1=="JANET" | ///
NAME1=="JACOB" | NAME1=="DAVID" | NAME1=="NANCY" | NAME1=="FLORA" 

// order by four-corner code, highest frquency within each, alphabetically for shared freq
// this means i can compare similar names in case matching character is unclear
// generate extended four-corner-code variable
gsort CHINESE1 -_freq NAME1
gen k4corner=""
order NAME1 CHINESE1 k4corner _freq FI length utf
format CHINESE1 %5s
format k4corner %9s
format FI %2s
format _freq length %2.0f

// convert from four-digit k4c to five (with decimal) for merge with unihan
// also ensures the correct character is identified (the surname) when there are multiple options
// cross-reference with mdbg.com character dictionary: search by k4c in memory
// ensure above sort first!!

replace k4corner ="0021.1" if CHINESE1=="0021"
replace k4corner ="0022.7" if CHINESE1=="0022" & FI=="F"
replace utf ="65B9" if CHINESE1=="0022" & FI=="F"
replace k4corner ="0022.2" if CHINESE1=="0022" & FI=="L"
replace k4corner ="0022.7" if CHINESE1=="0022" & (FI=="G" | FI=="K")
replace utf ="9AD8" if CHINESE1=="0022" & (FI=="G" | FI=="K")
replace k4corner ="0022.2" if CHINESE1=="0022" & FI=="W"
replace utf ="5ED6" if CHINESE1=="0022" & FI=="W"
replace k4corner ="0023.2" if CHINESE1=="0023" & FI=="H"
replace k4corner="0026.7" if CHINESE1=="0026" & FI=="T"
replace k4corner="0028.6" if CHINESE1=="0028"
replace k4corner="0040.0" if CHINESE1=="0040" & (FI=="M" | FI=="W")
replace k4corner="0040.6" if CHINESE1=="0040" & FI=="C"
replace k4corner="0121.1" if CHINESE1=="0121" & FI=="L"
replace k4corner="0164.6" if CHINESE1=="0164" & ((FI=="T" | FI=="H") & length==3)
replace k4corner="0164.6" if CHINESE1=="0164" & (NAME1=="HOME" | NAME1=="THOM" | NAME1=="HORN" | NAME1=="TAAM" | NAME1=="TAM THOM" | NAME1=="TAM TOM" | NAME1=="HOUM")
replace k4corner="0180.1" if CHINESE1=="0180"
replace k4corner="0365.0" if CHINESE1=="0365"
replace k4corner="0460.0" if CHINESE1=="0460" & (FI=="J" | FI=="D" | FI=="C" | NAME1=="TSE")
replace utf ="8B1D" if CHINESE1=="0460" & (FI=="J" | FI=="D" | FI=="C" | NAME1=="TSE")
replace k4corner="0722.7" if CHINESE1=="0722" & (FI=="K" | FI=="Q" | FI=="J" | (FI=="F" & _freq>1))
replace k4corner="0742.7" if CHINESE1=="0742" & (FI=="Q" | FI=="K" | FI=="G")
replace k4corner="0864.0" if CHINESE1=="0864" & (FI=="H" | NAME1=="SHU")
replace k4corner="1010.4" if CHINESE1=="1010" & (FI=="W" | FI=="O")
replace k4corner="1010.1" if CHINESE1=="1010" & NAME1=="SAM"
replace k4corner="1020.0" if CHINESE1=="1020" & (FI=="D" | FI=="T")
replace k4corner="1021.4" if CHINESE1=="1021" & NAME1!="YUEN"
replace k4corner="1040.0" if CHINESE1=="1040" & NAME1=="YEE"
replace k4corner="1060.3" if CHINESE1=="1060" & ((FI=="L" & _freq>1) | NAME1=="LEI" | NAME1=="LOUEY" | NAME1=="LOUY" | NAME1=="LUEY" | NAME=="LEWIE")
replace k4corner="1060.1" if CHINESE1=="1060" & FI=="N"
replace k4corner="1111.7" if CHINESE1=="1111"
replace k4corner="1123.2" if CHINESE1=="1123" & (_freq>2 | NAME1=="JEANG" | NAME1=="JEONG" | NAME1=="TCHANG" | NAME1=="GEUNG" | NAME1=="CHENG" | NAME1=="LEONG" | NAME1=="TSUNG" | NAME1=="TSCHANG" | NAME1=="TSANG" | NAME1=="CHAN" | NAME1=="CHANMRS" | NAME1=="GUNG" | NAME1=="JONG" | NAME1=="LEUNGMRS" | NAME1=="TJON")
replace k4corner="1224.7" if CHINESE1=="1224"
replace k4corner="1241.0" if CHINESE1=="1241" & NAME1!="LOWE"
replace k4corner="1249.3" if CHINESE1=="1249" & NAME1!="CHIA"
replace k4corner="1712.7" if CHINESE1=="1712" & (_freq>1 | NAME1=="DANG" | NAME1=="DUNG" | NAME1=="JUNG" | NAME1=="UNG" | NAME1=="AUNG")
replace k4corner="1750.7" if CHINESE1=="1750" // NOT 100% SURE ON THIS ONE
replace k4corner="1762.0+2428.1" if CHINESE1=="1762" & ((strpos(NAME1, "SO") > 0) | (strpos(NAME1, "HO") > 0) | (strpos(NAME1, "TO") > 0) | NAME1=="SUHU")
// wu
replace k4corner="2121.7" if CHINESE1=="2121" & (_freq>3 | NAME1=="WING" | NAME1=="WOO" | NAME1=="ANG")
replace utf="4F0D" if CHINESE1=="2121" & (_freq>3 | NAME1=="WING" | NAME1=="WOO" | NAME1=="ANG")
// hu
replace k4corner="2121.7" if CHINESE1=="2121" & NAME1=="HOO"
replace utf="864E" if CHINESE1=="2121" & NAME1=="HOO"
// lu
replace k4corner="2121.7" if CHINESE1=="2121" & (NAME1=="LOO" | NAME1=="LU" | NAME1=="LAO" | NAME1=="LO" | NAME1=="LOI" | NAME1=="LOON" | NAME1=="LOU" | NAME1=="LOW")
replace utf="76E7" if CHINESE1=="2121" & (NAME1=="LOO" | NAME1=="LU" | NAME1=="LAO" | NAME1=="LO" | NAME1=="LOI" | NAME1=="LOON" | NAME1=="LOU" | NAME1=="LOW")
replace k4corner="2122.0" if CHINESE1=="2122" & (_freq>1 | NAME1=="HALL" | NAME1=="HAW" | NAME1=="HAY")
replace k4corner="2140.6" if CHINESE1=="2140"
replace k4corner="2220.7" if CHINESE1=="2220" & FI=="S"
replace k4corner="2221.4" if CHINESE1=="2221" & (NAME1=="YAM" | NAME1=="YEN" | NAME1=="JIM")
replace k4corner="2277.0" if CHINESE1=="2277" // NOT SUPER SURE ON THIS
replace k4corner="2290.0" if CHINESE1=="2290" & NAME1=="LEE"
replace k4corner="2290.4 3290.4" if CHINESE1=="2290" & (NAME1=="LOCK" | NAME1=="LAY")
replace k4corner="2324.2" if CHINESE1=="2324" & NAME1=="FU"
replace k4corner="2510.0" if CHINESE1=="2510"
replace k4corner="2522.7" if CHINESE1=="2522" // NOT SUPER SURE ON THIS
replace k4corner="2590.0" if CHINESE1=="2590" & (_freq>1 | NAME1=="CHUE" | NAME1=="GEE CHEE" | NAME1=="JOE" | NAME1=="JUE")
replace k4corner="2590.6" if CHINESE1=="2590" & (NAME1=="CHANG" | NAME1=="CHONG")
replace k4corner="2599.6" if CHINESE1=="2599" & NAME1=="LIN"
replace k4corner="2600.0" if CHINESE1=="2600" & NAME=="PAI"
replace k4corner="2643.0" if CHINESE1=="2643" // only one option, not a named surname
replace k4corner="2691.4" if CHINESE1=="2691" & FI=="C"
replace k4corner="2692.2" if CHINESE1=="2692"
replace k4corner="2710.7" if CHINESE1=="2710"
replace k4corner="2713.2" if CHINESE1=="2713" & (FI=="L" | FI=="R")
replace k4corner="2721.2" if CHINESE1=="2721" & FI=="N"
replace k4corner="2722.0" if CHINESE1=="2722"
replace k4corner="2723.4" if CHINESE1=="2723"
replace k4corner="2771.2" if CHINESE1=="2771"
replace k4corner="2822.7" if CHINESE1=="2822" & FI=="L"
replace k4corner="2829.4" if CHINESE1=="2829" & ((FI=="C" & length<5) | FI=="T" | (FI=="S" & length==3))
replace k4corner="2950.2" if CHINESE1=="2950" // only one option, not a named surname
replace k4corner="3010.7" if CHINESE1=="3010"
replace k4corner="3040.4" if CHINESE1=="3040"
replace k4corner="3060.8" if CHINESE1=="3060" & length>3
replace k4corner="3073.2" if CHINESE1=="3073" // not super sure on this, only pronunciation match
replace k4corner="3077.7" if CHINESE1=="3077"
replace k4corner="3090.4" if CHINESE1=="3090" & FI=="S"
replace k4corner="3111.0" if CHINESE1=="3111" & (NAME1=="GONG" | NAME1=="KONG" | FI=="Q" | FI=="C")
replace k4corner="3112.7" if CHINESE1=="3112" & (FI=="F" & length>3)
replace k4corner="3119.6" if CHINESE1=="3119"
replace k4corner="3216.9" if CHINESE1=="3216" & FI=="P"
replace k4corner="3390.4" if CHINESE1=="3390" & FI=="L" & length>3 & NAME1!="LI MRS"
replace k4corner="3411.1" if CHINESE1=="3411" & (NAME1=="SEN" | NAME1=="SHING")
replace k4corner="3414.7" if CHINESE1=="3414" & FI=="L"
replace k4corner="3418.1" if CHINESE1=="3418" & NAME1=="HONG"
replace k4corner="3512.7" if CHINESE1=="3512"
replace k4corner="3521.8" if CHINESE1=="3521" // not super sure on this, only pronunciation match
replace k4corner="3530.0" if CHINESE1=="3530" & (NAME1=="LAN" | NAME1=="LEN")
replace k4corner="3611.7" if CHINESE1=="3611" & NAME1!="WONG"
replace k4corner="3612.7" if CHINESE1=="3612" & (FI=="H" | FI=="T")
replace k4corner="3722.7" if CHINESE1=="3722"
replace k4corner="4020.7" if CHINESE1=="4020" & FI=="M"
replace k4corner="4022.7" if CHINESE1=="4022" & FI=="N"
replace k4corner="4040.7" if CHINESE1=="4040" & ((FI=="L" & _freq>3) | strpos(NAME1, "LEE")>0 | NAME1=="BOWEN" | NAME1=="LEI") // maybe a few more but list super long with n/a's
replace k4corner="4073.2" if CHINESE1=="4073" & FI=="Y"
replace k4corner="4090.8" if CHINESE1=="4090"
replace k4corner="4212.2" if CHINESE1=="4212" & FI=="P"
replace k4corner="4241.3" if CHINESE1=="4241"
replace k4corner="4385.0" if CHINESE1=="4385" & (NAME1=="TAI" | NAME1=="DAI")
replace k4corner="4410.4" if CHINESE1=="4410"
replace k4corner="4421.4" if CHINESE1=="4421"
replace k4corner="4422.7" if CHINESE1=="4422" & FI=="S"
replace k4corner="4424.2" if CHINESE1=="4424" & (FI=="J" | FI=="C" | FI=="T")
replace k4corner="4439.4" if CHINESE1=="4439" & FI=="S"
replace k4corner="4442.7" if CHINESE1=="4442"
replace k4corner="4443.0" if CHINESE1=="4443" & FI=="M"
replace k4corner="4445.6" if CHINESE1=="4445"
replace k4corner="4450.4" if CHINESE1=="4450"
replace k4corner="4453.0" if CHINESE1=="4453"
replace k4corner="4471.1" if CHINESE1=="4471"
replace k4corner="4474.1" if CHINESE1=="4474" & FI=="S"
replace k4corner="4477.0" if CHINESE1=="4477"
replace k4corner="4480.6" if CHINESE1=="4480" & (strpos(NAME1, "NG")>0 | FI=="O")
replace k4corner="4490.4" if CHINESE1=="4490" & (FI=="Y" | strpos(NAME1, "IP")>0)
replace k4corner="4490.1" if CHINESE1=="4490" & (strpos(NAME1, "CH")>0 | FI=="T")
replace k4corner="4491.0" if CHINESE1=="4491" & (FI=="D" | FI=="T")
replace k4corner="4499.0" if CHINESE1=="4499" & (NAME1=="YING" | (FI=="L" & length==4))
replace k4corner="4692.7" if CHINESE1=="4692" & FI=="Y"
replace k4corner="4762.0" if CHINESE1=="4762" & (strpos(NAME1, "U")>0 | strpos(NAME1, "OO")>0 | NAME1=="WO")
replace k4corner="4895.7" if CHINESE1=="4895" & (FI=="M" | FI=="H")
replace k4corner="4980.2" if CHINESE1=="4980" & (FI=="C" | FI=="J")
replace k4corner="5090.4" if CHINESE1=="5090"
replace k4corner="5302.7" if CHINESE1=="5302" // not super sure on this, only pronunciation match
replace k4corner="5320.0" if CHINESE1=="5320"
replace k4corner="5560.6" if CHINESE1=="5560" & (FI=="C" | (FI=="T" & length==3))
replace k4corner="5198.6" if CHINESE1=="5798" // specifying simplified here
replace k4corner="6015.3" if CHINESE1=="6015"
replace k4corner="6022.7" if CHINESE1=="6022"
replace k4corner="6040.0" if CHINESE1=="6040" & FI!="L"
/// 6043 not coming up, but should really try to find. ng/ong/ing
replace k4corner="6060.0" if CHINESE1=="6060" & FI=="L"
replace k4corner="6091.4" if CHINESE1=="6091"
replace k4corner="6101.4" if CHINESE1=="6101" // not super sure on this, only pronunciation match
replace k4corner="6138.6" if CHINESE1=="6138"
replace k4corner="6624.8" if CHINESE1=="6624"
replace k4corner="6702.0" if CHINESE1=="6702"
replace k4corner="7121.1" if CHINESE1=="7121" & (FI=="Y" | strpos(NAME1,"OO")>0 | FI=="U")
replace k4corner="7132.7" if CHINESE1=="7132" & FI=="M"
replace k4corner="7171.6" if CHINESE1=="7171" & (FI=="O" | FI=="A")
replace k4corner="7210.0" if CHINESE1=="7210" & ((FI=="L" & length<5) | NAME1=="LOUIE")
replace k4corner="7210.1" if CHINESE1=="7210" &  ((FI=="H" & NAME1!="HING") | NAME1=="SHEW" | NAME1=="JU")
replace k4corner="7421.4" if CHINESE1=="7421" & FI=="L"
replace k4corner="7529.6" if CHINESE1=="7529" & (strpos(NAME1, "CHE")>0 | strpos(NAME1, "CHA")>0 | strpos(NAME1, "CHIN")>0 | strpos(NAME1, "TS")>0 | strpos(NAME1, "CHU")>0 | FI=="Z" | NAME1=="TONG" | strpos(NAME1, "JU")>0 | (strpos(NAME1, "ING")>0 & length==4))
replace k4corner="7712.7" if CHINESE1=="7712"
replace k4corner="7722.0" if CHINESE1=="7722" & (FI=="J" | (FI=="C" & length<5) | FI=="T")
replace k4corner="7727.2" if CHINESE1=="7727"  // not super sure on this, only pronunciation match
replace k4corner="7736.4" if CHINESE1=="7736"
replace k4corner="7740.7" if CHINESE1=="7740" // not super sure on this, only pronunciation match
replace k4corner="7777.2" if CHINESE1=="7777" & (FI=="Q" | FI=="K" | FI=="G" | FI=="C")
replace k4corner="7778.2" if CHINESE1=="7778" & NAME1!="EARL"
replace k4corner="7922.7" if CHINESE1=="7922" & FI=="S" // not super sure on this, only pronunciation match
replace k4corner="8010.9" if CHINESE1=="8010" & length<6
replace CHINESE1="8012" if CHINESE1=="8021" // likely typo
replace k4corner="8012.7" if CHINESE1=="8012" & (FI=="Y" | FI=="J")
replace _freq=21 if k4corner=="8012.7" & NAME1=="YOUNG"
duplicates drop NAME1 k4corner if k4corner=="8012.7", force // getting rid of this created dupe
replace k4corner="8060.6" if CHINESE1=="8060" & ((strpos(NAME1, "NG")>0 & FI!="L" & FI!="Y") | FI=="J")
replace k4corner="8090.4" if CHINESE1=="8090" & (FI=="Y" | FI=="N" | FI=="G")
replace k4corner="8211.4" if CHINESE1=="8211" & ((strpos(NAME1, "NG")>0 & FI!="N") | strpos(NAME1, "CHO")>0 | strpos(NAME1, "CHU")>0 | FI=="J")
replace k4corner="8742.7" if CHINESE1=="8742" & (strpos(NAME1, "NG")>0 | FI=="C")
replace k4corner="8824.3" if CHINESE1=="8824"
replace k4corner="8877.7" if CHINESE1=="8877" & FI!="Y"
replace k4corner="9021.1" if CHINESE1=="9021"
replace k4corner="9182.7" if CHINESE1=="9182" & FI=="B" // not super sure on this, only pronunciation match
replace k4corner="9801.6" if CHINESE1=="9801" // not super sure on this, only pronunciation match
replace k4corner="9942.7" if CHINESE1=="9942" & FI=="L"
replace k4corner="9990.4" if CHINESE1=="9990"

ren _freq NYC_freq
drop if k4corner==""
ren NAME1 name1_nyc
ren CHINESE1 cornercode_nyc

// format utf codes for merge
replace utf = "U+" + utf if utf!="."

save "$data/NYC crosswalk.dta", replace

********************************
***** MERGE UNIHAN AND NYC *****
********************************

// set frames for building dictionaries
// first export those that match already found utf codes
frame change default
use "$data/NYC crosswalk.dta", clear
merge m:1 k4corner utf using "$data/unihan-merge.dta"
keep if _merge==3
drop _merge
format k4corner %16s
frame copy default dict, replace

// next export those that are designated as surnames
// remove those already matched
// next narrow a few k4corners that have >1 surname associated
frame change default
use "$data/unihan-merge.dta", clear
keep if strpos(kDef, "surname")>0
drop if k4corner=="2121.7"
drop if k4corner=="4424.2" & utf!="U+8523"
drop if k4corner=="4480.6" & utf!="U+9EC3"
drop if k4corner=="7778.2" & utf!="U+6B27"
duplicates drop k4corner, force // this is okay because the other dupes are already matched
merge 1:m k4corner using "$data/NYC crosswalk.dta"
frame put if _merge==3, into(surnames)
frame change dict
frameappend surnames
frame dict: drop _merge
duplicates drop name1_nyc k4corner, force // must have shown up in both
frame drop surnames
tempfile temp
save `temp', replace

// manually add the remaining names
frame change default
use "$data/NYC crosswalk.dta", clear
merge 1:1 k4corner name1_nyc using `temp'
keep if _merge==1
drop _merge utf kCantonese kDef kPY kFrequency
mmerge k4corner using "$data/unihan-merge.dta"
drop if _merge==2
order name1_nyc k4corner kCantonese kPY utf kFrequency kDef FI
sort k4corner name1_nyc

// one not found in unihan
replace utf="U+63EB" if k4corner=="2950.2"
replace kPY="jiū" if k4corner=="2950.2"
replace kDef="grasp with hand, pinch" if k4corner=="2950.2"
replace kCantonese="cau1 zau1" if k4corner=="2950.2"
drop _merge

// narrowing to correct utf codes
drop if k4corner=="0023.2" & utf!="U+5EB7"
drop if k4corner=="0365.0" & utf!="U+8AA0"
drop if k4corner=="1010.1" & utf!="U+4E09"
drop if k4corner=="1060.1" & utf!="U+543E"
drop if k4corner=="2277.0" & utf!="U+5E7D"
drop if k4corner=="2290.0" & utf!="U+5229"
drop if k4corner=="2522.7" & utf!="U+4F5B"
drop if k4corner=="2600.0" & utf!="U+767D"
drop if k4corner=="2723.4" & utf!="U+5019"
drop if k4corner=="2822.7" & utf!="U+502B"
drop if k4corner=="2829.4" & utf!="U+5F90"
drop if k4corner=="3040.4" & utf!="U+5B89"
drop if k4corner=="3060.8" & utf!="U+5BB9"
drop if k4corner=="3073.2" & utf!="U+826F"
drop if k4corner=="3414.7" & utf!="U+6DE9"
drop if k4corner=="3418.1" & utf!="U+6D2A"
drop if k4corner=="3612.7" & utf!="U+6E6F"
drop if k4corner=="3722.7" & utf!="U+7941"
drop if k4corner=="4020.7" & utf!="U+9EA5"
drop if k4corner=="4022.7" & utf!="U+5357"
drop if k4corner=="4442.7" & utf!="U+842C"
drop if k4corner=="4443.0" & utf!="U+83AB"
drop if k4corner=="4453.0" & utf!="U+82F1"
drop if k4corner=="4471.1" & utf!="U+8001"
drop if k4corner=="4477.0" & utf!="U+7518"
drop if k4corner=="4490.4" & utf!="U+8449"
drop if k4corner=="5302.7" & utf!="U+8F14"
drop if k4corner=="5320.0" & utf!="U+6210"
drop if k4corner=="6022.7" & utf!="U+6613"
drop if k4corner=="6040.0" & utf!="U+7530"
drop if k4corner=="6091.4" & utf!="U+7F85"
drop if k4corner=="6101.4" & utf!="U+65FA"
drop if k4corner=="6702.0" & utf!="U+660E"
drop if k4corner=="7722.0" & utf!="U+5468"
drop if k4corner=="7727.2" & utf!="U+5C48"
drop if k4corner=="7922.7" & utf!="U+52DD"
drop if k4corner=="8012.7" & utf!="U+7FC1"
drop if k4corner=="8060.6" & utf!="U+66FE"

frame copy default surnames, replace
frame change dict
frameappend surnames
order name1_nyc utf k4corner kCantonese kPY kDef cornercode_nyc NYC_freq
sort name1_nyc
save "$data/crosswalk_long.dta", replace


****************************************
****** ADD TOTAL NAME FREQUENCIES ******
****************************************

ren name1_nyc name1
ren NYC_freq freq_c
keep name1 utf kPY freq_c
gsort name1 -freq_c
by name1: gen char = _n

reshape wide utf freq_c kPY, i(name1) j(char)

replace utf2="." if utf2==""
replace utf3="." if utf3==""
replace utf4="." if utf4==""
replace utf5="." if utf5==""
replace utf6="." if utf6==""
replace utf7="." if utf7==""
replace utf8="." if utf8==""
replace utf9="." if utf9==""

replace kPY2="." if kPY2==""
replace kPY3="." if kPY3==""
replace kPY4="." if kPY4==""
replace kPY5="." if kPY5==""
replace kPY6="." if kPY6==""
replace kPY7="." if kPY7==""
replace kPY8="." if kPY8==""
replace kPY9="." if kPY9==""

tempfile temp
save `temp', replace

use "$data/crosswalk_long.dta", clear
bys name1_nyc: egen name1_freq=sum(NYC_freq)
ren name1_nyc name1
duplicates drop name1 name1_freq, force
keep name1 name1_freq

merge 1:1 name1 using `temp'
drop _merge

ren utf* char*_utf
ren kPY* char*_py
ren freq_c* char*_freq

gen char1_prop = char1_freq/name1_freq
gen char2_prop = char2_freq/name1_freq
gen char3_prop = char3_freq/name1_freq
gen char4_prop = char4_freq/name1_freq
gen char5_prop = char5_freq/name1_freq
gen char6_prop = char6_freq/name1_freq

format char*_prop %7.3f 

order name1 name1_freq char1_utf char1_py char1_freq char1_prop char2* char3* char4* char5* char6* char7*

save "$data/crosswalk_final_wide.dta", replace
